//  z_Linux_asm.s:  - microtasking routines specifically
//                    written for Intel platforms running Linux* OS
// $Revision: 42181 $
// $Date: 2013-03-26 15:04:45 -0500 (Tue, 26 Mar 2013) $

// <copyright>
//    Copyright (c) 1997-2013 Intel Corporation.  All Rights Reserved.
//
//    Redistribution and use in source and binary forms, with or without
//    modification, are permitted provided that the following conditions
//    are met:
//
//      * Redistributions of source code must retain the above copyright
//        notice, this list of conditions and the following disclaimer.
//      * Redistributions in binary form must reproduce the above copyright
//        notice, this list of conditions and the following disclaimer in the
//        documentation and/or other materials provided with the distribution.
//      * Neither the name of Intel Corporation nor the names of its
//        contributors may be used to endorse or promote products derived
//        from this software without specific prior written permission.
//
//    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
//    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
//    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
//    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
//    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
//    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
//    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
//    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
//    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
//    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
//    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//
//------------------------------------------------------------------------
//
//    Portions of this software are protected under the following patents:
//        U.S. Patent 5,812,852
//        U.S. Patent 6,792,599
//        U.S. Patent 7,069,556
//        U.S. Patent 7,328,433
//        U.S. Patent 7,500,242
//
// </copyright>

// -----------------------------------------------------------------------
// macros
// -----------------------------------------------------------------------

#if KMP_ARCH_X86 || KMP_ARCH_X86_64

// pause_op: the instruction sequence emitted by __kmp_x86_pause and by the
// retry path of the x86_64 floating-point compare-exchange loops.
#if __MIC__ || __MIC2__
//
// the 'delay r16/r32/r64' should be used instead of the 'pause'.
// The delay operation has the effect of removing the current thread from
// the round-robin HT mechanism, and therefore speeds up the issue rate of
// the other threads on the same core.
//
// A value of 0 works fine for <= 2 threads per core, but causes the EPCC
// barrier time to increase greatly for 3 or more threads per core.
//
// A value of 100 works pretty well for up to 4 threads per core, but isn't
// quite as fast as 0 for 2 threads per core.
//
// We need to check what happens for oversubscription / > 4 threads per core.
// It is possible that we need to pass the delay value in as a parameter
// that the caller determines based on the total # threads / # cores.
//
// NOTE: the macro below is commented out, so pause_op is left undefined on
// MIC builds; every use of pause_op in this file must be compiled out there.
//
//.macro pause_op
//	mov    $100, %rax
//	delay  %rax
//.endm
#else
// 0xf3,0x90 is the machine encoding of the PAUSE instruction, emitted as raw
// bytes so that the assembler does not need to know the mnemonic.
# define pause_op   .byte 0xf3,0x90
#endif // __MIC__ || __MIC2__

// Helper macros used by every routine below:
//   KMP_PREFIX_UNDERSCORE(x)  symbol name as it must appear in the object file
//   ALIGN n                   alignment directive (see each variant for units)
//   PROC name                 aligns, exports and defines the entry label
//   DEBUG_INFO name           closes a routine (symbol type/size on Linux* OS)
# if defined __APPLE__ && defined __MACH__
#  define KMP_PREFIX_UNDERSCORE(x) _##x  // extra underscore for OS X* symbols
// ALIGN: the argument is handed to .align unchanged.
.macro ALIGN
	.align $0
.endmacro
// DEBUG_INFO: intentionally empty on OS X*.
.macro DEBUG_INFO
/* Not sure what .size does in icc, not sure if we need to do something
   similar for OS X*.
*/
.endmacro
// PROC: ALIGN 4, then a global label carrying the extra leading underscore.
.macro PROC
	ALIGN  4
	.globl KMP_PREFIX_UNDERSCORE($0)
KMP_PREFIX_UNDERSCORE($0):
.endmacro
# else // defined __APPLE__ && defined __MACH__
#  define KMP_PREFIX_UNDERSCORE(x) x  // no extra underscore for Linux* OS symbols
// ALIGN: the argument is a log2 value; the directive receives 1<<size, so
// "ALIGN 4" requests .align 16.
.macro ALIGN size
	.align 1<<(\size)
.endm
// DEBUG_INFO: pads to .align 16, marks the symbol as a function and records
// its size as the distance from the entry label to this point (the padding
// emitted by the .align above is therefore included in the size).
.macro DEBUG_INFO proc
// Not sure why we need .type and .size for the functions
	.align 16
	.type  \proc,@function
        .size  \proc,.-\proc
.endm
// PROC: ALIGN 4 (i.e. .align 16), then a global entry label.
.macro PROC proc
	ALIGN  4
        .globl KMP_PREFIX_UNDERSCORE(\proc)
KMP_PREFIX_UNDERSCORE(\proc):
.endm
# endif // defined __APPLE__ && defined __MACH__
#endif // KMP_ARCH_X86 || KMP_ARCH_X86_64


// -----------------------------------------------------------------------
// data
// -----------------------------------------------------------------------

#ifdef KMP_GOMP_COMPAT

//
// Support for unnamed common blocks.
//
// Because the symbol ".gomp_critical_user_" contains a ".", we have to
// put this stuff in assembly.
//

// IA-32: reserve the 32-byte common block ".gomp_critical_user_" and export
// a 4-byte data word, __kmp_unnamed_critical_addr, that holds its address.
#if KMP_ARCH_X86
# if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
// Three underscores: the symbol is spelled out by hand here instead of going
// through KMP_PREFIX_UNDERSCORE.
___kmp_unnamed_critical_addr:
        .long .gomp_critical_user_
# else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
// ALIGN takes a log2 value in the Linux* OS macro, so this is .align 16.
	ALIGN 4
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .4byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,4
#endif /* defined __APPLE__ && defined __MACH__ */
#endif /* KMP_ARCH_X86 */

// Intel(R) 64: reserve the 32-byte common block ".gomp_critical_user_" and
// export an 8-byte data word, __kmp_unnamed_critical_addr, that holds its
// address.
#if KMP_ARCH_X86_64
# if defined __APPLE__ && defined __MACH__
        .data
        .comm .gomp_critical_user_,32
        .data
        .globl ___kmp_unnamed_critical_addr
// Three underscores: the symbol is spelled out by hand here instead of going
// through KMP_PREFIX_UNDERSCORE.
___kmp_unnamed_critical_addr:
        .quad .gomp_critical_user_
# else /* Linux* OS */
        .data
        .comm .gomp_critical_user_,32,8
        .data
// NOTE(review): ALIGN takes a log2 value in the Linux* OS macro, so "ALIGN 8"
// expands to .align 1<<8, i.e. 256-byte alignment for an 8-byte object.
// ALIGN 3 would be sufficient -- confirm whether 256 is intended.
	ALIGN 8
        .global __kmp_unnamed_critical_addr
__kmp_unnamed_critical_addr:
        .8byte .gomp_critical_user_
        .type __kmp_unnamed_critical_addr,@object
        .size __kmp_unnamed_critical_addr,8
#endif /* defined __APPLE__ && defined __MACH__ */
#endif /* KMP_ARCH_X86_64 */

#endif /* KMP_GOMP_COMPAT */


#if KMP_ARCH_X86

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture
// running Linux* OS
// -----------------------------------------------------------------------
//

	.ident "Intel Corporation"
	.data
	ALIGN 4
// void
// __kmp_x86_pause( void );
//
// Emits the pause_op sequence and returns.  Takes no arguments and does not
// load or store any register or memory location itself.
//

// Switch back to the code section; the directives above selected .data.
        .text
	PROC  __kmp_x86_pause

// pause_op is the preprocessor macro defined at the top of this file.
        pause_op
        ret

	DEBUG_INFO __kmp_x86_pause

//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// Runs CPUID with %eax = mode and %ecx = mode2, then writes the resulting
// %eax, %ebx, %ecx and %edx into four consecutive 32-bit words starting at
// cpuid_buffer.
//
// parameters (relative to %ebp once the frame is set up):
// 	mode:		8(%ebp)
// 	mode2:		12(%ebp)
// 	cpuid_buffer:	16(%ebp)
//
	PROC  __kmp_x86_cpuid

	pushl %ebp
	movl  %esp, %ebp
	pushl %edi			// every register written below is saved
	pushl %ebx
	pushl %ecx
	pushl %edx

	movl  12(%ebp), %ecx		// "mode2"
	movl  8(%ebp), %eax		// "mode"
	cpuid				// results arrive in %eax, %ebx, %ecx, %edx

	movl  16(%ebp), %edi		// "cpuid_buffer"
	movl  %edx, 12(%edi)
	movl  %ecx, 8(%edi)
	movl  %ebx, 4(%edi)
	movl  %eax, 0(%edi)

	popl  %edx
	popl  %ecx
	popl  %ebx
	popl  %edi
	leave				// %esp = %ebp, then restore caller %ebp
	ret

	DEBUG_INFO __kmp_x86_cpuid


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically adds d to *p.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax (contents of *p before the addition)

        PROC  __kmp_test_then_add32

        movl      8(%esp), %eax    // "d"
        movl      4(%esp), %ecx    // "p"
        lock
        xaddl     %eax, (%ecx)     // *p += d; %eax receives the previous *p
        ret

        DEBUG_INFO __kmp_test_then_add32

//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores the byte d into *p and hands back the byte that was
// stored there before.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%al (only the low byte of %eax is written by this routine)

        PROC  __kmp_xchg_fixed8

        movb      8(%esp), %al     // "d"
        movl      4(%esp), %ecx    // "p"

        lock
        xchgb     %al, (%ecx)      // *p <- d, %al <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores the 16-bit value d into *p and hands back the value
// that was stored there before.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%ax

        PROC  __kmp_xchg_fixed16

        movw      8(%esp), %ax     // "d"
        movl      4(%esp), %ecx    // "p"

        lock
        xchgw     %ax, (%ecx)      // *p <- d, %ax <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores the 32-bit value d into *p and hands back the value
// that was stored there before.
//
// parameters:
// 	p:	4(%esp)
// 	d:	8(%esp)
//
// return:	%eax

        PROC  __kmp_xchg_fixed32

        movl      8(%esp), %eax    // "d"
        movl      4(%esp), %ecx    // "p"

        lock
        xchgl     %eax, (%ecx)     // *p <- d, %eax <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
//

        PROC  __kmp_compare_and_store8

        movb      12(%esp), %dl    // "sv"
        movb      8(%esp), %al     // "cv" (cmpxchg compares against %al)
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgb  %dl, (%ecx)
        setz      %al              // %al = 1 when the compare matched, else 0
        andl      $1, %eax         // clear the remaining bits of %eax
        ret

        DEBUG_INFO __kmp_compare_and_store8

//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
//

        PROC  __kmp_compare_and_store16

        movw      12(%esp), %dx    // "sv"
        movw      8(%esp), %ax     // "cv" (cmpxchg compares against %ax)
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgw  %dx, (%ecx)
        setz      %al              // %al = 1 when the compare matched, else 0
        andl      $1, %eax         // clear the remaining bits of %eax
        ret

        DEBUG_INFO __kmp_compare_and_store16

//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax = 1 if the store happened, 0 otherwise
//

        PROC  __kmp_compare_and_store32

        movl      12(%esp), %edx   // "sv"
        movl      8(%esp), %eax    // "cv" (cmpxchg compares against %eax)
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgl  %edx, (%ecx)
        setz      %al              // %al = 1 when the compare matched, else 0
        andl      $1, %eax         // clear the remaining bits of %eax
        ret

        DEBUG_INFO __kmp_compare_and_store32

//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomically replaces the 64-bit value *p with sv if *p equals cv.
//
// parameters (relative to %ebp once the frame is set up):
// 	p:	8(%ebp)
// 	cv:	12(%ebp) low word, 16(%ebp) high word
// 	sv:	20(%ebp) low word, 24(%ebp) high word
//
// return:	%eax = 1 if the store happened, 0 otherwise
//
        PROC  __kmp_compare_and_store64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        // cmpxchg8b compares %edx:%eax with memory and stores %ecx:%ebx.
        movl      24(%ebp), %ecx        // "sv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      12(%ebp), %eax        // "cv" low order word
        movl      8(%ebp), %edi         // "p"
        lock
        cmpxchg8b (%edi)
        setz      %al           // %al = 1 when the compare matched, else 0
        andl      $1, %eax      // clear the remaining bits of %eax

        popl      %edi
        popl      %ebx
        leave                   // %esp = %ebp, then restore caller %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store64

//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%al (equals cv on success, the current *p otherwise)
//

        PROC  __kmp_compare_and_store_ret8

        movb      12(%esp), %dl    // "sv"
        movb      8(%esp), %al     // "cv"
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgb  %dl, (%ecx)      // on mismatch %al is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8

//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%ax (equals cv on success, the current *p otherwise)
//

        PROC  __kmp_compare_and_store_ret16

        movw      12(%esp), %dx    // "sv"
        movw      8(%esp), %ax     // "cv"
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgw  %dx, (%ecx)      // on mismatch %ax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16

//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	4(%esp)
// 	cv:	8(%esp)
// 	sv:	12(%esp)
//
// return:	%eax (equals cv on success, the current *p otherwise)
//

        PROC  __kmp_compare_and_store_ret32

        movl      12(%esp), %edx   // "sv"
        movl      8(%esp), %eax    // "cv"
        movl      4(%esp), %ecx    // "p"
        lock
        cmpxchgl  %edx, (%ecx)     // on mismatch %eax is reloaded from *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32

//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomically replaces the 64-bit value *p with sv if *p equals cv, and
// returns the value that was in *p when the comparison was made.
//
// parameters (relative to %ebp once the frame is set up):
// 	p:	8(%ebp)
// 	cv:	12(%ebp) low word, 16(%ebp) high word
// 	sv:	20(%ebp) low word, 24(%ebp) high word
//
// return:	%edx:%eax
//
        PROC  __kmp_compare_and_store_ret64

        pushl     %ebp
        movl      %esp, %ebp
        pushl     %ebx
        pushl     %edi

        // cmpxchg8b compares %edx:%eax with memory and stores %ecx:%ebx.
        movl      24(%ebp), %ecx        // "sv" high order word
        movl      20(%ebp), %ebx        // "sv" low order word
        movl      16(%ebp), %edx        // "cv" high order word
        movl      12(%ebp), %eax        // "cv" low order word
        movl      8(%ebp), %edi         // "p"
        lock
        cmpxchg8b (%edi)        // on mismatch %edx:%eax is reloaded from *p

        popl      %edi
        popl      %ebx
        leave                   // %esp = %ebp, then restore caller %ebp
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically stores data into *addr and returns the value that was replaced.
//
// parameters (on entry):
// 	addr:	4(%esp)
// 	data:	8(%esp)
//
// Once %ebp has been pushed and the frame established these become
// 8(%ebp) and 12(%ebp); 4(%ebp) is the return address.
//
// return:	st(0)  (the raw bits of the old value are also left in %eax)
//
// The value returned is the one the xchg itself swapped out, so it is the
// value actually replaced even when other threads write *addr concurrently.
// Exactly one value is left on the x87 register stack.


        PROC  __kmp_xchg_real32

        pushl   %ebp
        movl    %esp, %ebp
        subl    $4, %esp
                        // scratch slot at -4(%ebp)
        pushl   %esi

        movl    8(%ebp), %esi
                        // "addr"
        movl    12(%ebp), %eax
                        // "data" as raw 32-bit pattern

        lock
        xchgl   %eax, (%esi)
                        // *addr <- data, %eax <- previous *addr

        movl    %eax, -4(%ebp)
        flds    -4(%ebp)
                        // return old value in st(0)

        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real32
//
// kmp_real32
// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically performs *addr += data using a compare-exchange retry loop and
// returns, on the x87 stack, the value *addr held before the addition.
//
// parameters (relative to %ebp once the frame is set up):
// 	addr:	8(%ebp)
// 	data:	12(%ebp)
//
// locals:
// 	old_value:	-4(%ebp)
// 	new_value:	-8(%ebp)
//
// return:	st(0)
//

        PROC  __kmp_test_then_add_real32

_addr = 8
_data = 12
_old_value = -4
_new_value = -8

        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        pushl   %esi
        pushl   %ebx
        movl    _addr(%ebp), %esi
// Retry point: everything from here to the jnz is redone whenever another
// writer changed <addr> between the load and the cmpxchg.
// NOTE(review): L22 is neither a .L-prefixed nor a numeric local label;
// confirm it cannot clash with other symbols in this object file.
L22:
        flds    (%esi)
                        // load <addr>
        fsts    _old_value(%ebp)
                        // store into old_value
        fadds   _data(%ebp)
        fstps   _new_value(%ebp)
                        // new_value = old_value + data
                        // (fstps pops, so the x87 stack is empty again)

        movl    _old_value(%ebp), %eax
                        // load old_value
        movl    _new_value(%ebp), %ebx
                        // load new_value

	lock
	cmpxchgl %ebx,(%esi)
                        // Compare %EAX with <addr>.  If equal set
                        // ZF and load %EBX into <addr>.  Else, clear
                        // ZF and load <addr> into %EAX.
        jnz     L22


        flds    _old_value(%ebp)
                        // return old_value
        popl    %ebx
        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_test_then_add_real32

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real64
//
// kmp_real64
// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically performs *addr += data using a cmpxchg8b retry loop and
// returns, on the x87 stack, the value *addr held before the addition.
//
// parameters (relative to %ebp once the frame is set up):
// 	addr:	8(%ebp)
// 	data:	12(%ebp), 8 bytes
//
// locals:
// 	old_value:	-8(%ebp), 8 bytes
// 	new_value:	-16(%ebp), 8 bytes
//
// return:	st(0)
//
        PROC  __kmp_test_then_add_real64

_addr = 8
_data = 12
_old_value = -8
_new_value = -16

        pushl   %ebp
        movl    %esp, %ebp
        subl    $16, %esp
        pushl   %esi
        pushl   %ebx
        pushl   %ecx
        pushl   %edx
        movl    _addr(%ebp), %esi
// Retry point: everything from here to the jnz is redone whenever another
// writer changed <addr> between the load and the cmpxchg8b.
// NOTE(review): L44 is neither a .L-prefixed nor a numeric local label;
// confirm it cannot clash with other symbols in this object file.
L44:
        fldl    (%esi)
                        // load <addr>
        fstl    _old_value(%ebp)
                        // store into old_value
        faddl   _data(%ebp)
        fstpl   _new_value(%ebp)
                        // new_value = old_value + data
                        // (fstpl pops, so the x87 stack is empty again)

        movl    _old_value+4(%ebp), %edx
        movl    _old_value(%ebp), %eax
                        // load old_value
        movl    _new_value+4(%ebp), %ecx
        movl    _new_value(%ebp), %ebx
                        // load new_value

	lock
	cmpxchg8b (%esi)
                        // Compare %EDX:%EAX with <addr>.  If equal set
                        // ZF and load %ECX:%EBX into <addr>.  Else, clear
                        // ZF and load <addr> into %EDX:%EAX.
        jnz     L44


        fldl    _old_value(%ebp)
                        // return old_value
        popl    %edx
        popl    %ecx
        popl    %ebx
        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_test_then_add_real64

        PROC  __kmp_test_then_add_complex32
// Atomically adds (data_real, data_imag) to the pair of single-precision
// values stored at addr (real part at offset 0, imaginary part at offset 4).
// Both halves are updated together by one cmpxchg8b, retried until no other
// writer interfered.
//
// parameters (relative to %ebp once the frame is set up):
// 	addr:		8(%ebp)
// 	data_real:	12(%ebp)
// 	data_imag:	16(%ebp)
//
// locals:
// 	old_real/old_imag:	-8(%ebp) / -4(%ebp)
// 	new_real/new_imag:	-16(%ebp) / -12(%ebp)
//
// Nothing is left on the x87 stack at return; %edx:%eax still hold the bit
// patterns of the old imaginary:real pair.
// NOTE(review): no prototype is given in this file, so the intended return
// convention cannot be confirmed from here.

_addr = 8
_data_real = 12
_data_imag = 16
_old_real = -8
_old_imag = -4
_new_real = -16
_new_imag = -12

        pushl   %ebp
        movl    %esp, %ebp
        subl    $16, %esp
        pushl   %esi
        pushl   %ebx
        pushl   %ecx
        movl    _addr(%ebp), %esi
// NOTE(review): LOOP is neither a .L-prefixed nor a numeric local label;
// confirm it cannot clash with other symbols in this object file.
LOOP:
// real part: %eax = old bits, %ebx = bits of (old + data_real)
	flds    (%esi)
	fsts    _old_real(%ebp)
	movl    _old_real(%ebp), %eax
	fadds   _data_real(%ebp)
	fstps   _new_real(%ebp)
	movl    _new_real(%ebp), %ebx
// imaginary part: %edx = old bits, %ecx = bits of (old + data_imag)
	flds    4(%esi)
	fsts    _old_imag(%ebp)
	movl    _old_imag(%ebp), %edx
	fadds   _data_imag(%ebp)
	fstps   _new_imag(%ebp)
	movl    _new_imag(%ebp), %ecx

	lock
	cmpxchg8b (%esi)
                        // Compare %EDX:%EAX with <addr>.  If equal set
                        // ZF and load %ECX:%EBX into <addr>.  Else, clear
                        // ZF and load <addr> into %EDX:%EAX.
        jnz     LOOP
        popl    %ecx
        popl    %ebx
        popl    %esi
        movl    %ebp, %esp
        popl    %ebp
        ret

        DEBUG_INFO __kmp_test_then_add_complex32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// Loads the x87 FPU control word from the 16-bit value at *p.
//
// parameters:
// 	p:	4(%esp)
//

        PROC  __kmp_load_x87_fpu_control_word

        movl  4(%esp), %eax
                        // "p"
        fldcw (%eax)
                        // control word <- *p
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// Stores the current x87 FPU control word into the 16-bit location *p.
//
// parameters:
// 	p:	4(%esp)
//

        PROC  __kmp_store_x87_fpu_control_word

        movl  4(%esp), %eax
                        // "p"
        fstcw (%eax)
                        // *p <- control word
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
// Clears the exception flags in the x87 FPU status word.  Takes no
// arguments.
//

        PROC  __kmp_clear_x87_fpu_status_word

        fnclex
                        // no-wait form: pending exceptions are not checked first
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word

//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_mxcsr
//
// void
// __kmp_load_mxcsr( kmp_int32 *p );
//
// Loads the MXCSR register from the 32-bit value at *p.
//
// parameters:
// 	p:	4(%esp)
//

        PROC  __kmp_load_mxcsr

        movl  4(%esp), %eax
                        // "p"
        ldmxcsr (%eax)
                        // MXCSR <- *p
        ret

        DEBUG_INFO __kmp_load_mxcsr


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_mxcsr
//
// void
// __kmp_store_mxcsr( kmp_int32 *p );
//
// Stores the current MXCSR register into the 32-bit location *p.
//
// parameters:
// 	p:	4(%esp)
//

        PROC  __kmp_store_mxcsr

        movl  4(%esp), %eax
                        // "p"
        stmxcsr (%eax)
                        // *p <- MXCSR
        ret

        DEBUG_INFO __kmp_store_mxcsr

//------------------------------------------------------------------------
//
// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( microtask_t pkfn, int gtid, int tid,
//                         int argc, void *p_argv[] ) {
//    (*pkfn)( & gtid, & tid, argv[0], ... );
//    return 1;
// }
//
// parameters (relative to %ebp once the frame is set up):
// 	pkfn:	8(%ebp)
// 	gtid:	12(%ebp)
// 	tid:	16(%ebp)
// 	argc:	20(%ebp)
// 	p_argv:	24(%ebp)
//
// The outgoing argument list is built on the stack: p_argv[argc-1] down to
// p_argv[0] are pushed first, then the addresses of this routine's own tid
// and gtid argument slots.  Before pushing, %esp is lowered so that it ends
// up on a 128-byte boundary at the call instruction.
//
// return:	%eax = 1

// -- Begin __kmp_invoke_microtask
// mark_begin;
	PROC  __kmp_invoke_microtask

	pushl %ebp
	movl %esp,%ebp		// establish the base pointer for this routine.
	subl $8,%esp		// allocate space for two local variables.
				// These variables are:
				//	argv: -4(%ebp)
				//	temp: -8(%ebp)  (not referenced below)
				//
	pushl %ebx		// save %ebx to use during this routine
				// (it lands at -12(%ebp))
				//
	movl 20(%ebp),%ebx	// Stack alignment - # args
	addl $2,%ebx		// #args +2  Always pass at least 2 args (gtid and tid)
	shll $2,%ebx		// Number of bytes used on stack: (#args+2)*4
	movl %esp,%eax		//
	subl %ebx,%eax		// %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
	movl %eax,%ebx		// Save to %ebx
	andl $0xFFFFFF80,%eax	// mask off 7 bits
	subl %eax,%ebx		// Amount to subtract from %esp
				// (= distance of the unadjusted pointer above
				//  the next lower 128-byte boundary)
	subl %ebx,%esp		// Prepare the stack ptr --
				//   now it will be aligned on 128-byte boundary at the call

	movl 24(%ebp),%eax	// copy from p_argv[]
	movl %eax,-4(%ebp)	// into the local variable *argv.

	movl 20(%ebp),%ebx	// argc is 20(%ebp)
	shll $2,%ebx		// %ebx = byte offset just past the last argv entry

// Loop head: %ebx counts down in steps of 4 bytes; leave when it reaches 0.
.invoke_2:
	cmpl $0,%ebx
	jg  .invoke_4
	jmp .invoke_3
	ALIGN 2
// Loop body: push argv[%ebx/4 - 1], i.e. the arguments in reverse order.
.invoke_4:
	movl -4(%ebp),%eax
	subl $4,%ebx			// decrement argc.
	addl %ebx,%eax			// index into argv.
	movl (%eax),%edx
	pushl %edx

	jmp .invoke_2
	ALIGN 2
// All argv entries pushed: add the two leading pointer arguments and call.
.invoke_3:
	leal 16(%ebp),%eax		// push & tid
	pushl %eax

	leal 12(%ebp),%eax		// push & gtid
	pushl %eax

	movl 8(%ebp),%ebx
	call *%ebx			// call (*pkfn)();

	movl $1,%eax			// return 1;

	movl -12(%ebp),%ebx		// restore %ebx
	leave				// also discards the pushed arguments and alignment gap
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask


// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Returns the processor time-stamp counter.  Takes no arguments.
//
// return:	%edx:%eax (high:low halves, as written by rdtsc)
	PROC  __kmp_hardware_timestamp
	rdtsc
	ret

	DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp

// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86 */


#if KMP_ARCH_X86_64

// -----------------------------------------------------------------------
// microtasking routines specifically written for IA-32 architecture and
// Intel(R) 64 running Linux* OS
// -----------------------------------------------------------------------

// -- Machine type P
// mark_description "Intel Corporation";
	.ident "Intel Corporation"
// --	.file "z_Linux_asm.s"
// The data section is selected here, so every routine that follows must
// switch back to .text itself before its PROC line.
	.data
	ALIGN 4

// AC: The following #if used to hide the .text directive, which moved the rest of the code into the .data section on MIC.
// To prevent this in the future, .text is added to every routine definition for x86_64.
#if __MIC__ || __MIC2__

#else

//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_pause
//
// void
// __kmp_x86_pause( void );
//
// Emits the pause_op sequence and returns.  Takes no arguments.  This
// routine is compiled out on MIC builds, where pause_op is not defined.
//

        .text
        PROC  __kmp_x86_pause

// pause_op is the preprocessor macro defined at the top of this file.
        pause_op
        ret

        DEBUG_INFO __kmp_x86_pause

#endif // __MIC__ || __MIC2__

//------------------------------------------------------------------------
//
// FUNCTION __kmp_x86_cpuid
//
// void
// __kmp_x86_cpuid( int mode, int mode2, void *cpuid_buffer );
//
// Runs CPUID with %eax = mode and %ecx = mode2, then writes the resulting
// %eax, %ebx, %ecx and %edx into four consecutive 32-bit words starting at
// cpuid_buffer.
//
// parameters:
// 	mode:		%edi
// 	mode2:		%esi
// 	cpuid_buffer:	%rdx

        .text
        PROC  __kmp_x86_cpuid

        pushq  %rbp
        movq   %rsp, %rbp
        pushq  %rbx                     // cpuid overwrites %ebx; callee-saved

        movl   %edi, %eax               // "mode"
        movl   %esi, %ecx               // "mode2" (read before %rsi is reused)
        movq   %rdx, %rsi               // keep cpuid_buffer; cpuid overwrites %edx
        cpuid

        movl   %edx, 12(%rsi)           // store results into buffer
        movl   %ecx, 8(%rsi)
        movl   %ebx, 4(%rsi)
        movl   %eax, 0(%rsi)

        popq   %rbx                     // callee-saved register
        leave                           // %rsp = %rbp, then restore caller %rbp
        ret

        DEBUG_INFO __kmp_x86_cpuid


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add32
//
// kmp_int32
// __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically adds d to *p and returns the value *p held before the addition.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax

        .text
        PROC  __kmp_test_then_add32

        movl      %esi, %eax	// "d"
        lock
        xaddl     %eax,(%rdi)
                        // *p += d; %eax receives the previous *p
        ret

        DEBUG_INFO __kmp_test_then_add32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add64
//
// kmp_int64
// __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically adds d to *p and returns the value *p held before the addition.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
//	return:	%rax

        .text
        PROC  __kmp_test_then_add64

        movq      %rsi, %rax	// "d"
        lock
        xaddq     %rax,(%rdi)
                        // *p += d; %rax receives the previous *p
        ret

        DEBUG_INFO __kmp_test_then_add64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed8
//
// kmp_int32
// __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
//
// Atomically stores the byte d into *p and returns the byte that was stored
// there before.
// NOTE(review): the prototype above says kmp_int32, but only %al is written
// here -- confirm the declared return type against the C header.
//
// parameters:
// 	p:	%rdi
// 	d:	%sil
//
// return:	%al

        .text
        PROC  __kmp_xchg_fixed8

        movb      %sil, %al	// "d"

        lock
        xchgb     %al,(%rdi)
                        // *p <- d, %al <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed16
//
// kmp_int16
// __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
//
// Atomically stores the 16-bit value d into *p and returns the value that
// was stored there before.
//
// parameters:
// 	p:	%rdi
// 	d:	%si
// return:     %ax

        .text
        PROC  __kmp_xchg_fixed16

        movw      %si, %ax	// "d"

        lock
        xchgw     %ax,(%rdi)
                        // *p <- d, %ax <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed32
//
// kmp_int32
// __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
//
// Atomically stores the 32-bit value d into *p and returns the value that
// was stored there before.
//
// parameters:
// 	p:	%rdi
// 	d:	%esi
//
// return:	%eax

        .text
        PROC  __kmp_xchg_fixed32

        movl      %esi, %eax	// "d"

        lock
        xchgl     %eax,(%rdi)
                        // *p <- d, %eax <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_fixed64
//
// kmp_int64
// __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
//
// Atomically stores the 64-bit value d into *p and returns the value that
// was stored there before.
//
// parameters:
// 	p:	%rdi
// 	d:	%rsi
// return:	%rax

        .text
        PROC  __kmp_xchg_fixed64

        movq      %rsi, %rax	// "d"

        lock
        xchgq     %rax,(%rdi)
                        // *p <- d, %rax <- previous *p
        ret

        DEBUG_INFO __kmp_xchg_fixed64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store8
//
// kmp_int8
// __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%sil
//	sv:	%dl
//
// return:	%rax = 1 if the store happened, 0 otherwise

        .text
        PROC  __kmp_compare_and_store8

        movb      %sil, %al     // "cv" (cmpxchg compares against %al)
        lock
        cmpxchgb  %dl, (%rdi)
        setz      %al           // %al = 1 when the compare matched, else 0
        andl      $1, %eax      // keep bit 0 only; a 32-bit write also zeroes
                                // the upper half of %rax
        ret

        DEBUG_INFO __kmp_compare_and_store8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store16
//
// kmp_int16
// __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%rax = 1 if the store happened, 0 otherwise

        .text
        PROC  __kmp_compare_and_store16

        movw      %si, %ax      // "cv" (cmpxchg compares against %ax)
        lock
        cmpxchgw  %dx, (%rdi)
        setz      %al           // %al = 1 when the compare matched, else 0
        andl      $1, %eax      // keep bit 0 only; a 32-bit write also zeroes
                                // the upper half of %rax
        ret

        DEBUG_INFO __kmp_compare_and_store16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store32
//
// kmp_int32
// __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomically replaces *p with sv if *p equals cv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%rax = 1 if the store happened, 0 otherwise

        .text
        PROC  __kmp_compare_and_store32

        movl      %esi, %eax    // "cv" (cmpxchg compares against %eax)
        lock
        cmpxchgl  %edx, (%rdi)
        setz      %al           // %al = 1 when the compare matched, else 0
        andl      $1, %eax      // keep bit 0 only; a 32-bit write also zeroes
                                // the upper half of %rax
        ret

        DEBUG_INFO __kmp_compare_and_store32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store64
//
// kmp_int32
// __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomically replaces the 64-bit value *p with sv if *p equals cv.
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//
// return:	%rax = 1 if the store happened, 0 otherwise

        .text
        PROC  __kmp_compare_and_store64

        movq      %rsi, %rax    // "cv" (cmpxchg compares against %rax)
        lock
        cmpxchgq  %rdx, (%rdi)
        setz      %al           // %al = 1 when the compare matched, else 0
        andl      $1, %eax      // keep bit 0 only; a 32-bit write also zeroes
                                // the upper half of %rax
        ret

        DEBUG_INFO __kmp_compare_and_store64

//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret8
//
// kmp_int8
// __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	%rdi
// 	cv:	%sil
//	sv:	%dl
//
// return:	%al (the remaining bits of %rax are not written here)

        .text
        PROC  __kmp_compare_and_store_ret8

        movb      %sil, %al	// "cv"
        lock
        cmpxchgb  %dl,(%rdi)
                        // match: *p <- sv, %al stays cv
                        // mismatch: %al <- current *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret8


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret16
//
// kmp_int16
// __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	%rdi
// 	cv:	%si
//	sv:	%dx
//
// return:	%ax (the remaining bits of %rax are not written here)

        .text
        PROC  __kmp_compare_and_store_ret16

        movw      %si, %ax	// "cv"
        lock
        cmpxchgw  %dx,(%rdi)
                        // match: *p <- sv, %ax stays cv
                        // mismatch: %ax <- current *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret16


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret32
//
// kmp_int32
// __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
//
// Atomically replaces *p with sv if *p equals cv, and returns the value that
// was in *p when the comparison was made.
//
// parameters:
// 	p:	%rdi
// 	cv:	%esi
//	sv:	%edx
//
// return:	%eax

        .text
        PROC  __kmp_compare_and_store_ret32

        movl      %esi, %eax	// "cv"
        lock
        cmpxchgl  %edx,(%rdi)
                        // match: *p <- sv, %eax stays cv
                        // mismatch: %eax <- current *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_compare_and_store_ret64
//
// kmp_int64
// __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
//
// Atomically replaces the 64-bit value *p with sv if *p equals cv, and
// returns the value that was in *p when the comparison was made.
//
// parameters:
// 	p:	%rdi
// 	cv:	%rsi
//	sv:	%rdx
//	return:	%rax

        .text
        PROC  __kmp_compare_and_store_ret64

        movq      %rsi, %rax    // "cv"
        lock
        cmpxchgq  %rdx,(%rdi)
                        // match: *p <- sv, %rax stays cv
                        // mismatch: %rax <- current *p
        ret

        DEBUG_INFO __kmp_compare_and_store_ret64


#if __MIC__ || __MIC2__
#else

//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real32
//
// kmp_real32
// __kmp_test_then_add_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Atomically performs *addr += data using a compare-exchange retry loop and
// returns the value *addr held before the addition.
//
// parameters:
// 	addr:	%rdi
// 	data:	%xmm0 (lower 4 bytes)
//
// return:	%xmm0 (lower 4 bytes)
//
// Registers written: %eax, %ecx, %xmm0, %xmm1.

        .text
        PROC  __kmp_test_then_add_real32
// Retry point: the value is re-read and the sum recomputed on every attempt.
1:
	movss   (%rdi), %xmm1	// load value of <addr>
	movd	%xmm1, %eax	// save old value of <addr>

	addss	%xmm0, %xmm1	// new value = old value + <data>
	movd	%xmm1, %ecx	// move new value to GP reg.

	lock
	cmpxchgl %ecx, (%rdi)	// Compare %EAX with <addr>.  If equal set
                             	// ZF and exchange %ECX with <addr>.  Else,
                                // clear ZF and load <addr> into %EAX.
        jz      2f
// Another writer got in first: emit the pause sequence, then try again.
	pause_op
	jmp	1b
2:
	movd	%eax, %xmm0	// load old value into return register
        ret

        DEBUG_INFO __kmp_test_then_add_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_test_then_add_real64
//
// kmp_real64
// __kmp_test_then_add_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Atomically performs *addr += data using a compare-exchange retry loop and
// returns the value *addr held before the addition.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//      return: %xmm0 (lower 8 bytes)
//
// Registers written: %rax, %rcx, %xmm0, %xmm1.
//

        .text
        PROC  __kmp_test_then_add_real64
// Retry point: the value is re-read and the sum recomputed on every attempt.
1:
        movlpd	(%rdi), %xmm1	// load value of <addr>
	movd	%xmm1, %rax	// save old value of <addr>

	addsd	%xmm0, %xmm1	// new value = old value + <data>
	movd	%xmm1, %rcx	// move new value to GP reg.

	lock
	cmpxchgq  %rcx, (%rdi) 	// Compare %RAX with <addr>.  If equal set
				// ZF and exchange %RCX with <addr>.  Else,
				// clear ZF and load <addr> into %RAX.
        jz      2f
// Another writer got in first: emit the pause sequence, then try again.
	pause_op
	jmp     1b

2:
	movd	%rax, %xmm0	// load old value into return register
        ret

        DEBUG_INFO __kmp_test_then_add_real64


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real32
//
// kmp_real32
// __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
//
// Stores <data> at <addr> with a locked exchange and returns the 4 bytes
// that were at <addr> before the exchange.
//
// parameters:
// 	addr:	%rdi
// 	data:	%xmm0 (lower 4 bytes)
//
// return:	%xmm0 (lower 4 bytes)
//
// temporaries:	%eax

        .text
        PROC  __kmp_xchg_real32

        movd    %xmm0, %eax             // %eax <- bits of <data>

        lock
        xchgl   %eax, (%rdi)            // swap: <addr> <- <data>, %eax <- previous contents

        movd    %eax, %xmm0             // previous contents become the return value
        ret

        DEBUG_INFO __kmp_xchg_real32


//------------------------------------------------------------------------
//
// FUNCTION __kmp_xchg_real64
//
// kmp_real64
// __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
//
// Stores <data> at <addr> with a locked exchange and returns the 8 bytes
// that were at <addr> before the exchange.
//
// parameters:
//      addr:   %rdi
//      data:   %xmm0 (lower 8 bytes)
//
// return:      %xmm0 (lower 8 bytes)
//
// temporaries: %rax
//

        .text
        PROC  __kmp_xchg_real64

        movd    %xmm0, %rax             // %rax <- bits of <data>

        lock
        xchgq   %rax, (%rdi)            // swap: <addr> <- <data>, %rax <- previous contents

        movd    %rax, %xmm0             // previous contents become the return value
        ret

        DEBUG_INFO __kmp_xchg_real64


#endif // __MIC__ || __MIC2__


//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_x87_fpu_control_word
//
// void
// __kmp_load_x87_fpu_control_word( kmp_int16 *p );
//
// Sets the x87 FPU control word to the value read from *p.
//
// parameters:
// 	p:	%rdi	(address of the control word value to load)
//

        .text
        PROC  __kmp_load_x87_fpu_control_word

        fldcw   (%rdi)                  // FPU control word <- *p
        ret

        DEBUG_INFO __kmp_load_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_x87_fpu_control_word
//
// void
// __kmp_store_x87_fpu_control_word( kmp_int16 *p );
//
// Writes the current x87 FPU control word to *p.
//
// parameters:
// 	p:	%rdi	(address that receives the control word)
//

        .text
        PROC  __kmp_store_x87_fpu_control_word

        fstcw   (%rdi)                  // *p <- FPU control word
        ret

        DEBUG_INFO __kmp_store_x87_fpu_control_word


//------------------------------------------------------------------------
//
// FUNCTION __kmp_clear_x87_fpu_status_word
//
// void
// __kmp_clear_x87_fpu_status_word();
//
// Clears the x87 FPU status word.  Takes no parameters and returns nothing.
//
// Regular builds use a single fnclex.  __MIC__ / __MIC2__ builds avoid
// fnclex and instead round-trip the FP environment through scratch memory
// below %rsp, clearing the status bits in the saved copy.
//

        .text
        PROC  __kmp_clear_x87_fpu_status_word

#if __MIC__ || __MIC2__
// TODO: remove the workaround for problem with fnclex instruction (no CQ known)
        fstenv  -32(%rsp)               // dump the FP environment below the stack pointer
        andw    $~0x80ff, 4-32(%rsp)    // status word sits at offset 4: zero bits 0-7 and 15
        fldenv  -32(%rsp)               // reload the patched environment
#else
        fnclex                          // clear the status word directly
#endif
        ret

        DEBUG_INFO __kmp_clear_x87_fpu_status_word


#if __MIC__ || __MIC2__

// LRB1:
// fp87: non-desired instruction on LRB1
// ldmxcsr and stmxcsr: do not exist on LRB1

#else

//------------------------------------------------------------------------
//
// FUNCTION __kmp_load_mxcsr
//
// void
// __kmp_load_mxcsr( kmp_int32 *p );
//
// Sets the MXCSR register to the value read from *p.
//
// parameters:
// 	p:	%rdi	(address of the MXCSR value to load)
//

        .text
        PROC  __kmp_load_mxcsr

        ldmxcsr (%rdi)                  // MXCSR <- *p
        ret

        DEBUG_INFO __kmp_load_mxcsr


//------------------------------------------------------------------------
//
// FUNCTION __kmp_store_mxcsr
//
// void
// __kmp_store_mxcsr( kmp_int32 *p );
//
// Writes the current contents of the MXCSR register to *p.
//
// parameters:
// 	p:	%rdi	(address that receives the MXCSR value)
//

        .text
        PROC  __kmp_store_mxcsr

        stmxcsr (%rdi)                  // *p <- MXCSR
        ret

        DEBUG_INFO __kmp_store_mxcsr

#endif // __MIC__ || __MIC2__


//------------------------------------------------------------------------
//
// typedef void	(*microtask_t)( int *gtid, int *tid, ... );
//
// int
// __kmp_invoke_microtask( void (*pkfn) (int *gtid, int *tid, ...),
//		           int gtid, int tid,
//                         int argc, void *p_argv[],
//                         void **exit_frame ) {
//    *exit_frame = <frame pointer of this routine>;
//    (*pkfn)( & gtid, & tid, argv[0], ... );
//    return 1;
// }
//
// NOTE(review): the C type of the sixth parameter is not visible in this
// file; the code only shows that one 64-bit value (this routine's %rbp) is
// stored through the pointer passed in %r9.  Confirm against the prototype.
//
// note:
//	at call to pkfn must have %rsp 128-byte aligned for compiler
//
// parameters:
//      %rdi:  	pkfn
//	%esi:	gtid
//	%edx:	tid
//	%ecx:	argc	(32-bit int; sign-extended into %rcx on entry, since
//			 the calling convention does not define the upper
//			 32 bits of a register holding an int argument)
//	%r8:	p_argv
//	%r9:	&exit_frame
//
// locals:
//	__gtid:	gtid parm pushed on stack so can pass &gtid to pkfn
//	__tid:	tid parm pushed on stack so can pass &tid to pkfn
//
// reg temps:
//	%rax:	used all over the place
//	%rdx:	used in stack pointer alignment calculation, then to walk
//		p_argv backwards while pushing stack parameters
//	%r11:	holds p_argv while loading the register parameters
//	%rsi:	used as temporary for stack parameters
//		used as temporary for number of pkfn parms to push
//	%rbx:	used to hold pkfn address, and zero constant, callee-save
//
// return:	%eax 	(always 1/TRUE)
//

__gtid = -16
__tid = -24

// -- Begin __kmp_invoke_microtask
// mark_begin;
        .text
	PROC  __kmp_invoke_microtask

	pushq 	%rbp		// save base pointer
	movq 	%rsp,%rbp	// establish the base pointer for this routine.
// begin OMPT SUPPORT
	movq	%rbp, (%r9)	// save exit_frame
// end OMPT SUPPORT

	pushq 	%rbx		// %rbx is callee-saved register; lives at -8(%rbp)

	pushq	%rsi		// Put gtid on stack so can pass &gtid to pkfn
	pushq	%rdx		// Put tid on stack so can pass &tid to pkfn

	movslq	%ecx, %rcx	// argc is a 32-bit int: widen it once so that the
				// 64-bit arithmetic below never sees whatever the
				// caller left in the upper half of %rcx

	movq	%rcx, %rax	// Stack alignment calculation begins; argc -> %rax
	movq	$0, %rbx	// constant for cmovs later
	subq	$4, %rax	// subtract four args passed in registers to pkfn
#if __MIC__ || __MIC2__
	js	L_kmp_0		// jump to movq
	jmp	L_kmp_0_exit	// jump ahead
L_kmp_0:
	movq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
L_kmp_0_exit:
#else
	cmovsq	%rbx, %rax	// zero negative value in %rax <- max(0, argc-4)
#endif // __MIC__ || __MIC2__

	movq	%rax, %rsi	// save max(0, argc-4) -> %rsi for later
	shlq 	$3, %rax	// Number of bytes used on stack: max(0, argc-4)*8

	movq 	%rsp, %rdx	//
	subq 	%rax, %rdx	// %rsp-(max(0,argc-4)*8) -> %rdx --
				// without align, stack ptr would be this
	movq 	%rdx, %rax	// Save to %rax

	andq 	$0xFFFFFFFFFFFFFF80, %rax  // mask off lower 7 bits (128 bytes align)
	subq 	%rax, %rdx	// Amount to subtract from %rsp
	subq 	%rdx, %rsp	// Prepare the stack ptr --
				// now %rsp will align to 128-byte boundary at call site

				// setup pkfn parameter reg and stack
	movq	%rcx, %rax	// argc -> %rax
	cmpq	$0, %rsi
	je	L_kmp_invoke_pass_parms	// jump ahead if no parms to push
	shlq	$3, %rcx	// argc*8 -> %rcx
	movq 	%r8, %rdx	// p_argv -> %rdx
	addq	%rcx, %rdx	// &p_argv[argc] -> %rdx

	movq	%rsi, %rcx	// max (0, argc-4) -> %rcx

L_kmp_invoke_push_parms:	// push nth - 7th parms to pkfn on stack
	subq	$8, %rdx	// decrement p_argv pointer to previous parm
	movq	(%rdx), %rsi	// p_argv[%rcx-1] -> %rsi
	pushq	%rsi		// push p_argv[%rcx-1] onto stack (reverse order)
	subl	$1, %ecx

// C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
//		if the name of the label that is an operand of this jecxz starts with a dot (".");
//	   Apple's linker does not support 1-byte length relocation;
//         Resolution: replace all .labelX entries with L_labelX.

	jecxz   L_kmp_invoke_pass_parms  // stop when four p_argv[] parms left
	jmp	L_kmp_invoke_push_parms

	ALIGN 3
L_kmp_invoke_pass_parms:	// put 1st - 6th parms to pkfn in registers.
				// order here is important to avoid trashing
				// registers used for both input and output parms!

// begin OMPT SUPPORT
//	leaq    -8(%rsp),%r11   // Address after the return address has been pushed (r11 unused here)
//	movq	%r11, (%r9)	// save exit_frame
//	movq	%rsp, (%r9)	// save exit_frame
// end OMPT SUPPORT

	movq	%rdi, %rbx	// pkfn -> %rbx
	leaq	__gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
	leaq	__tid(%rbp), %rsi  // &tid -> %rsi (store 2nd parm to pkfn)

	movq	%r8, %r11	// p_argv -> %r11

#if __MIC__ || __MIC2__
	cmpq	$4, %rax	// argc >= 4?
	jns	L_kmp_4		// jump to movq
	jmp	L_kmp_4_exit    // jump ahead
L_kmp_4:
	movq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)
L_kmp_4_exit:

	cmpq	$3, %rax	// argc >= 3?
	jns	L_kmp_3		// jump to movq
	jmp	L_kmp_3_exit    // jump ahead
L_kmp_3:
	movq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)
L_kmp_3_exit:

	cmpq	$2, %rax	// argc >= 2?
	jns	L_kmp_2		// jump to movq
	jmp	L_kmp_2_exit    // jump ahead
L_kmp_2:
	movq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)
L_kmp_2_exit:

	cmpq	$1, %rax	// argc >= 1?
	jns	L_kmp_1		// jump to movq
	jmp	L_kmp_1_exit    // jump ahead
L_kmp_1:
	movq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
L_kmp_1_exit:
#else
				// NOTE: a cmov with a memory source reads its memory
				// operand even when the condition is false, so the
				// first four slots of p_argv must be readable
				// whatever the value of argc.
	cmpq	$4, %rax	// argc >= 4?
	cmovnsq	24(%r11), %r9	// p_argv[3] -> %r9 (store 6th parm to pkfn)

	cmpq	$3, %rax	// argc >= 3?
	cmovnsq	16(%r11), %r8	// p_argv[2] -> %r8 (store 5th parm to pkfn)

	cmpq	$2, %rax	// argc >= 2?
	cmovnsq	8(%r11), %rcx	// p_argv[1] -> %rcx (store 4th parm to pkfn)

	cmpq	$1, %rax	// argc >= 1?
	cmovnsq	(%r11), %rdx	// p_argv[0] -> %rdx (store 3rd parm to pkfn)
#endif // __MIC__ || __MIC2__

	call	*%rbx		// call (*pkfn)();
	movq	$1, %rax	// move 1 into return register;

	movq	-8(%rbp), %rbx	// restore %rbx	using %rbp since %rsp was modified
	movq 	%rbp, %rsp	// restore stack pointer
	popq 	%rbp		// restore frame pointer
	ret

	DEBUG_INFO __kmp_invoke_microtask
// -- End  __kmp_invoke_microtask

//------------------------------------------------------------------------
//
// FUNCTION __kmp_hardware_timestamp
//
// kmp_uint64
// __kmp_hardware_timestamp(void)
//
// Reads the time-stamp counter with rdtsc and returns it as one 64-bit
// value.
//
// return:	%rax
//
// temporaries:	%rdx
//
        .text
        PROC  __kmp_hardware_timestamp

        rdtsc                           // counter arrives split as %edx (high) : %eax (low)
        shlq    $32, %rdx               // move the high half into bits 32-63
        orq     %rdx, %rax              // merge the halves: %rax = high << 32 | low
        ret

        DEBUG_INFO __kmp_hardware_timestamp
// -- End  __kmp_hardware_timestamp

//------------------------------------------------------------------------
//
// FUNCTION __kmp_bsr32
//
// int
// __kmp_bsr32( int );
//
// Returns the result of a 32-bit bit-scan-reverse on the argument, i.e. the
// bit index of its most significant set bit.
//
// parameters:
//      %edi:   value to scan
//
// return:      %eax
//
// NOTE(review): bsr does not define its destination when the source is
// zero, and no zero check is made here - callers presumably never pass 0;
// confirm.
//

        .text
        PROC  __kmp_bsr32

        bsrl    %edi, %eax              // %eax <- index of highest set bit in %edi
        ret

        DEBUG_INFO __kmp_bsr32

	
// -----------------------------------------------------------------------
#endif /* KMP_ARCH_X86_64 */
